In [35]:
## 사전실행코드
import polars as pl
df_spotify = (pl.read_csv("./universal_top_spotify_songs.csv", try_parse_dates = True,
null_values = [""])
.filter(pl.col('snapshot_date').dt.year() == 2024).sort('snapshot_date'))
df_spotify = (
df_spotify.with_columns(pl.when(pl.col('country').is_null() == True) ## country 열이 null이면
.then(pl.lit('WW')) ## WW로 변경
.otherwise(pl.col('country')).alias('country')) ## 아니면 원래대로
.drop_nulls()) ## 그 외 null이 들어간 행 삭제
key_levels = pl.Enum(["C", "C#", "D", "Eb", "E", "F", "F#", "G", "G#", "A", "Bb", "B"])
df_spotify = (df_spotify.with_columns(pl.col('key').cast(pl.String)
.replace(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
["C", "C#", "D", "Eb", "E", "F", "F#", "G", "G#", "A", "Bb", "B"]))
.with_columns(pl.col('key').cast(key_levels)).sort('key'))
df_spotify = (df_spotify.with_columns(pl.col('artists').str.split(', ')) ## ,를 기준으로 문자열을 분리
## 리스트의 첫 번째 아이템을 가져와서 main_vocal로 저장
.with_columns(pl.col('artists').list.get(0, null_on_oob = True).alias('main_vocal'),
pl.col('artists').list.tail(-1).alias('featuring')) ## 첫 번째 아이템을 제외한 나머지를featuring으로 저장
.with_columns(pl.when(pl.col('featuring').list.len() == 0) ## 리스트 길이가 0이면
.then(None) ## None으로 설정
.otherwise(pl.col('featuring')).name.keep())) ## 아니면 그대로 유지
import pycountry_convert as pc
def get_continent_name(nation_code: str) -> str:
if nation_code != 'WW':
continent_code = pc.country_alpha2_to_continent_code(nation_code)
else:
continent_code = 'WW'
continent_dict = {"NA": "North America","SA": "South America", "AS": "Asia", "AF": "Africa",
"OC": "Oceania", "EU": "Europe", "AQ": "Antarctica", "WW": "Global"}
return continent_dict[continent_code]
df_spotify = (df_spotify.with_columns(pl.col('country')
## 앞서 정의한 함수를 country 열에 적용
.map_elements(get_continent_name, return_dtype = pl.String).alias('continent')))
In [36]:
pl.Config(set_tbl_cols = 10, set_tbl_rows = 25)
(df_spotify.select(pl.col([pl.Int64, pl.Float64])).describe()
.transpose(include_header = True, header_name = 'columns',
column_names = ["count", "null_count", "mean", "std", "min", "25%", "50%", "75%", "max"])[1:])
Out[36]:
shape: (16, 10)
| columns | count | null_count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str | str | str |
| "daily_rank" | "1281585.0" | "0.0" | "25.488521635318765" | "14.428953828238736" | "1.0" | "13.0" | "25.0" | "38.0" | "50.0" |
| "daily_movement" | "1281585.0" | "0.0" | "0.9522794040192418" | "7.048585204849036" | "-49.0" | "-1.0" | "0.0" | "2.0" | "49.0" |
| "weekly_movement" | "1281585.0" | "0.0" | "2.6433954829371444" | "11.958440767017487" | "-49.0" | "-3.0" | "0.0" | "5.0" | "49.0" |
| "popularity" | "1281585.0" | "0.0" | "75.99977215713355" | "15.612366849702711" | "0.0" | "65.0" | "79.0" | "88.0" | "100.0" |
| "duration_ms" | "1281585.0" | "0.0" | "192147.9038237807" | "49510.28735642052" | "16320.0" | "160413.0" | "184250.0" | "216338.0" | "939666.0" |
| "danceability" | "1281585.0" | "0.0" | "0.6840186434766325" | "0.13658257971471674" | "0.0" | "0.594" | "0.703" | "0.785" | "0.988" |
| "energy" | "1281585.0" | "0.0" | "0.6552570154470442" | "0.1615410243104661" | "0.0000201" | "0.557" | "0.673" | "0.768" | "0.998" |
| "loudness" | "1281585.0" | "0.0" | "-6.383302003378629" | "2.551654916926727" | "-37.334" | "-7.754" | "-5.952" | "-4.668" | "3.233" |
| "mode" | "1281585.0" | "0.0" | "0.5399329736225065" | "0.49840300103880864" | "0.0" | "0.0" | "1.0" | "1.0" | "1.0" |
| "speechiness" | "1281585.0" | "0.0" | "0.09353411915713745" | "0.08890688930938012" | "0.0" | "0.0391" | "0.0574" | "0.108" | "0.937" |
| "acousticness" | "1281585.0" | "0.0" | "0.2664934433930094" | "0.24594449685538267" | "3.45e-6" | "0.0638" | "0.183" | "0.431" | "0.996" |
| "instrumentalness" | "1281585.0" | "0.0" | "0.01710200619257404" | "0.0913520037990717" | "0.0" | "0.0" | "1.2e-6" | "0.0000754" | "0.977" |
| "liveness" | "1281585.0" | "0.0" | "0.16931368773823038" | "0.12396759653567327" | "0.0139" | "0.0957" | "0.12" | "0.201" | "0.978" |
| "valence" | "1281585.0" | "0.0" | "0.5548212007865261" | "0.2268662283516568" | "0.0" | "0.374" | "0.559" | "0.739" | "0.992" |
| "tempo" | "1281585.0" | "0.0" | "122.27016181993389" | "28.035717348594673" | "0.0" | "100.019" | "119.96" | "140.082" | "236.089" |
| "time_signature" | "1281585.0" | "0.0" | "3.907079124677645" | "0.387560985392341" | "0.0" | "4.0" | "4.0" | "4.0" | "5.0" |
In [37]:
## plotly가 주피터 노트북, 주피터 랩에서 표시되지 않는 경우 아래의 코드를 실행시키세요
import plotly.io as pio
pio.renderers.default = "notebook_connected"
import plotly.express as px
fig = px.histogram(df_spotify, x = 'daily_rank')
fig.show()
In [38]:
fig = px.histogram(df_spotify, x = 'daily_movement')
fig.show()
In [39]:
fig = px.histogram(df_spotify, x = 'weekly_movement')
fig.show()
In [40]:
fig = px.histogram(df_spotify, x = 'popularity')
fig.show()
In [41]:
fig = px.pie(df_spotify.group_by('is_explicit').len('count'), values = "count")
fig.update_traces(textinfo = "percent+label")
fig.show()
In [42]:
pl.Config(set_tbl_cols = 10, set_tbl_rows = 25)
(df_spotify.select(pl.col([pl.String, pl.List(pl.String)])). describe()
.transpose(include_header = True, header_name = 'columns',
column_names = ['count', 'null_count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])[1:])
Out[42]:
shape: (8, 10)
| columns | count | null_count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str | str | str |
| "spotify_id" | "1281585" | "0" | null | null | "003vvx7Niy0yvhvHt4a68B" | null | null | null | "7zyWm8JihcIiYmfNkbzeHE" |
| "name" | "1281585" | "0" | null | null | ""Se"" | null | null | null | "한 페이지가 될 수 있게" |
| "artists" | "1281585.0" | "0.0" | null | null | null | null | null | null | null |
| "country" | "1281585" | "0" | null | null | "AE" | null | null | null | "ZA" |
| "album_name" | "1281585" | "0" | null | null | ""Cold Sweats"" | null | null | null | "黑玻璃" |
| "main_vocal" | "1281585" | "0" | null | null | "$$Double-Dolla$$" | null | null | null | "高爾宣 OSN" |
| "featuring" | "519897.0" | "761688.0" | null | null | null | null | null | null | null |
| "continent" | "1281585" | "0" | null | null | "Africa" | null | null | null | "South America" |
In [43]:
df_spotify.select(pl.col([pl.String]).unique().len())
Out[43]:
shape: (1, 6)
| spotify_id | name | country | album_name | main_vocal | continent |
|---|---|---|---|---|---|
| u32 | u32 | u32 | u32 | u32 | u32 |
| 16360 | 14550 | 73 | 11086 | 5770 | 7 |
In [44]:
fig = px.bar((df_spotify.group_by('country').agg(pl.len().alias('count'))
.with_columns(pl.when(pl.col('country') == "KR").then(pl.lit("KR"))
.when(pl.col('country') == "WW").then(pl.lit("Global")).otherwise(pl.lit("Others")).
alias('국가'))),
x = 'country', y = 'count', color = '국가')
fig.update_xaxes(categoryorder = "total descending")
fig.show()
In [45]:
df_spotify.group_by('country').len().sort('len', descending = True)
Out[45]:
shape: (73, 2)
| country | len |
|---|---|
| str | u32 |
| "IT" | 17720 |
| "DO" | 17718 |
| "NI" | 17715 |
| "CZ" | 17709 |
| "PL" | 17709 |
| "CR" | 17708 |
| "SV" | 17708 |
| "HN" | 17708 |
| "EG" | 17708 |
| "FI" | 17707 |
| "HU" | 17706 |
| "KZ" | 17706 |
| "TH" | 17706 |
| … | … |
| "IN" | 17454 |
| "MA" | 17454 |
| "LV" | 17448 |
| "CA" | 17447 |
| "US" | 17387 |
| "UA" | 17360 |
| "NZ" | 17295 |
| "LU" | 17281 |
| "VN" | 16954 |
| "UY" | 16952 |
| "VE" | 16808 |
| "GB" | 16682 |
In [46]:
fig = px.bar(df_spotify.group_by('continent').agg(pl.col('country').unique().len()),
x = 'continent', y = 'country', text = 'country')
fig.update_xaxes(categoryorder = "total descending")
fig.show()
In [47]:
(df_spotify.group_by('continent').agg(pl.col('country').unique().len()).sort('country',
descending = True))
Out[47]:
shape: (7, 2)
| continent | country |
|---|---|
| str | u32 |
| "Europe" | 29 |
| "Asia" | 17 |
| "North America" | 10 |
| "South America" | 10 |
| "Africa" | 4 |
| "Oceania" | 2 |
| "Global" | 1 |